@dataknut)knitr::opts_chunk$set(echo = TRUE)
# Set start time ----
startTime <- proc.time()
# Libraries ----
message("Loading libraries...")
## Loading libraries...
library(here) # where are we?
## here() starts at /Users/ben/github/dataknut/botanicals
library(data.table) # data frames with superpowers
library(ggplot2) # pretty plots
library(kableExtra) # pretty tables
library(lubridate) # date & time stuff
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:here':
##
## here
## The following object is masked from 'package:base':
##
## date
library(plotly) # interactive plots
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(skimr) # descriptive & summary stats
# Parameters ----
dPath <- paste0(here::here(), "/data/")
dFile <- paste0(dPath, "ClimateData_1976_2019.csv")
# get data ----
message("Getting data from ", dFile)
## Getting data from /Users/ben/github/dataknut/botanicals/data/ClimateData_1976_2019.csv
weatherDT <- data.table::fread(dFile) # load data
We want to load some UK weather data and have a look at it. We then want to see if we can:
weatherDT$V1 <- NULL # what's the row number for?
# do some pre-processing ----
weatherDT[, rDate := lubridate::dmy(Date)] # make nice date
weatherDT[, year := lubridate::year(rDate)] # make nice year
weatherDT[, month := lubridate::month(rDate, label = TRUE, abbr = TRUE)] # make nice date
t <- head(weatherDT) # what have we got?
kableExtra::kable(t, caption = "First few rows of processed data") %>%
kable_styling()
| Date | maxT | minT | prcpt | rDate | year | month |
|---|---|---|---|---|---|---|
| 1/01/76 | 8.0 | 1.1 | 17.3 | 1976-01-01 | 1976 | Jan |
| 2/01/76 | 9.6 | 0.3 | 41.0 | 1976-01-02 | 1976 | Jan |
| 3/01/76 | 10.3 | 1.3 | 26.6 | 1976-01-03 | 1976 | Jan |
| 4/01/76 | 10.6 | 1.4 | 5.3 | 1976-01-04 | 1976 | Jan |
| 5/01/76 | 9.2 | -0.2 | 18.3 | 1976-01-05 | 1976 | Jan |
| 6/01/76 | 8.1 | 4.7 | 4.7 | 1976-01-06 | 1976 | Jan |
skimr::skim(weatherDT)
| Name | weatherDT |
| Number of rows | 16063 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| Date | 1 |
| factor | 1 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Date | 0 | 1 | 7 | 8 | 0 | 16063 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| rDate | 0 | 1 | 1976-01-01 | 2019-12-31 | 1997-12-27 | 16063 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| month | 0 | 1 | TRUE | 12 | Jan: 1364, Mar: 1364, May: 1364, Jul: 1364 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| maxT | 0 | 1 | 11.55 | 6.02 | -6.5 | 7.1 | 11.4 | 15.9 | 32.70 | ▁▆▇▃▁ |
| minT | 0 | 1 | 5.13 | 4.95 | -14.0 | 1.4 | 5.2 | 9.0 | 25.46 | ▁▃▇▃▁ |
| prcpt | 0 | 1 | 3.65 | 6.30 | 0.0 | 0.0 | 0.7 | 4.9 | 72.20 | ▇▁▁▁▁ |
| year | 0 | 1 | 1997.49 | 12.70 | 1976.0 | 1986.0 | 1997.0 | 2008.0 | 2019.00 | ▇▇▇▇▇ |
Just for fun let’s plot the data. You should be able to hover the mouse over the dots in 3.1 to investigate.
# just for fun
plotDT <- weatherDT[, .(meanPrcpt = mean(prcpt)),
keyby = .(year, month)]
p <- ggplot2::ggplot(plotDT, aes(x = month, y = meanPrcpt, group = year, colour = year)) +
geom_point() +
scale_color_viridis_c() +
labs(y = "Mean precipitation",
caption = "Year treated as continuous for pretty colours")
plotly::ggplotly(p) # interactive plot
Figure 3.1: Mean precipitation per month by year
So that looks vaguely right.
Count the number of spans (per year?) of 15 or more days when the daily rainfall < 0.2. This is the UK definition of drought.
To do this we:
setkey(weatherDT,rDate) # ensures ordered by date
weatherDT[, droughtThresh := ifelse(prcpt < 0.2, 1, 0)] # to make life easier
# flag the start & ends of low precipitation periods
weatherDT[, droughtPeriod := ifelse(droughtThresh == 1 &
shift(droughtThresh == 0), # first of 1 -> n days with low
"Start",
NA)]
weatherDT[, droughtPeriod := ifelse(droughtThresh == 0 &
shift(droughtThresh == 1), # last of 1 -> n days with low
"End",
droughtPeriod)]
# now isolate the start & ends of low precipitation periods
lowPrcptPeriodsDT <- weatherDT[droughtPeriod == "Start" |
droughtPeriod == "End"]
lowPrcptPeriodsDT[, periodCount := ifelse(droughtPeriod == "Start",
shift(rDate, type = "lead") - rDate, # n days between start & end (they are ordered)
NA) # undefined between end & start (we don't care about periods between droughts for now)
]
# flag the droughts using the 14 day threshold
lowPrcptPeriodsDT[, drought := ifelse(periodCount > 14,
"Drought start",
"Not drought")]
plotDT <- lowPrcptPeriodsDT[, .(nDroughts = .N), # data.table magic
keyby = .(drought, year, start = month)] # add up number of droughts/no droughts in these periods of low precip
# save out the low precip periods
outF <- paste0(dPath, "lowPrecipSequences.csv")
data.table::fwrite(lowPrcptPeriodsDT, outF) # includes start and end of sequences
# save out just the flagged droughts
lowPrcptPeriodsDT[, drought := ifelse(droughtPeriod == "End" &
shift(periodCount > 14),
"Drought end",
drought)
] # flag drought end as drought too for data output
outF <- paste0(dPath, "lowPrecipSequencesDroughts.csv")
data.table::fwrite(lowPrcptPeriodsDT[drought %like% "Drought"], outF)
ggplot2::ggplot(lowPrcptPeriodsDT, aes(x = periodCount)) +
geom_histogram() +
labs(x = "Duration of low precipitation periods (sequences of days < 0.2 cm, entire period)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2364 rows containing non-finite values (stat_bin).
Table ?? shows the number of low precipitation events (1 or more days with < 0.2) which were droughts (> 14 days in a row) by start month and year. Perhaps surprisingly 1976 has none.
kableExtra::kable(plotDT[drought == "Drought start"], caption = "Number of low precipitation periods that were droughts by year") %>%
kable_styling()
| drought | year | start | nDroughts |
|---|---|---|---|
| Drought start | 1977 | May | 1 |
| Drought start | 1980 | May | 1 |
| Drought start | 1981 | Aug | 1 |
| Drought start | 1984 | Apr | 1 |
| Drought start | 1984 | Aug | 1 |
| Drought start | 1985 | Oct | 1 |
| Drought start | 1989 | Nov | 1 |
| Drought start | 1991 | Aug | 1 |
| Drought start | 1991 | Nov | 1 |
| Drought start | 1992 | Jun | 1 |
| Drought start | 1995 | Apr | 1 |
| Drought start | 1995 | Jun | 1 |
| Drought start | 1995 | Jul | 1 |
| Drought start | 1997 | May | 1 |
| Drought start | 2000 | May | 1 |
| Drought start | 2002 | Sep | 1 |
| Drought start | 2003 | Mar | 1 |
| Drought start | 2003 | Apr | 1 |
| Drought start | 2011 | Apr | 1 |
| Drought start | 2013 | Jul | 1 |
| Drought start | 2018 | Jun | 1 |
| Drought start | 2019 | Apr | 1 |
Just to make this clear, Figure ?? shows the pattern of number of droughts by start month and year.
p <- ggplot2::ggplot(plotDT[drought == "Drought start"],
aes(x = start, fill = nDroughts, y = year)) +
geom_tile() +
labs(caption = "Note: months without droughts (Jan/Feb & Dec) not shown")
p
Figure 4.1: Number of droughts by start month and year
If this chart in not what you expect then remember:
Of note:
We can also plot the drought spans themselves (which might have been more useful in any case :-) - see Figure 4.3.
droughtsDT <- lowPrcptPeriodsDT[drought %like% "Drought"]
ggplot2::ggplot(droughtsDT, aes(x = periodCount)) +
geom_histogram() +
labs(x = "Drought duration (days) - whole period")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).
Figure 4.2: Drought periods histogram
t <- droughtsDT[, .(meanDuration = mean(periodCount, na.rm = TRUE),
minDuration = min(periodCount, na.rm = TRUE),
maxDuration = max(periodCount, na.rm = TRUE))]
kableExtra::kable(t, caption = "Duration stats (all years)") %>%
kable_styling()
| meanDuration | minDuration | maxDuration |
|---|---|---|
| 18.18182 | 15 | 21 |
droughtsDT[, startDate := rDate]
droughtsDT[, endDate := shift(rDate, type = "lead")]
# so how do we plot that?
plotDT <- droughtsDT[drought == "Drought start"]
plotDT[, startDec := lubridate::date_decimal(2000 + (lubridate::decimal_date(startDate) - year))]
plotDT[, endDec := lubridate::date_decimal(2000 + (lubridate::decimal_date(endDate) - year))]
p <- ggplot2::ggplot(plotDT, aes(x = startDec, xend = endDec,
y = year, yend = year)) +
geom_segment(colour = "red") +
# theme(legend.position="bottom") +
# guides(colour = guide_legend(title = "Year", nrow = 2)) +
scale_y_continuous(breaks = c(1975, 1980, 1985,1990,1995,2000,2005,2010,2015,2020)) +
labs(x = "Date") +
scale_x_datetime(date_breaks = "1 month", date_labels = "%b %d")
p +
geom_text(
label= plotDT$year,
size = 3,
nudge_y = 0.5,
check_overlap = T
)
Figure 4.3: Drought periods by year
Is that what we expect? The periods appear visually similar in length but this just reflects the tight distribution of durations (see Figure 4.2).
t <- proc.time() - startTime
elapsed <- t[[3]]
Analysis completed in 9.699 seconds ( 0.16 minutes) using knitr in RStudio with R version 3.6.3 (2020-02-29) running on x86_64-apple-darwin15.6.0.
R packages used:
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_NZ.UTF-8/en_NZ.UTF-8/en_NZ.UTF-8/C/en_NZ.UTF-8/en_NZ.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] skimr_2.1 plotly_4.9.2 lubridate_1.7.4 kableExtra_1.1.0
## [5] ggplot2_3.3.0 data.table_1.12.8 here_0.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.0.0 xfun_0.12 repr_1.1.0 purrr_0.3.3
## [5] colorspace_1.4-1 vctrs_0.2.3 htmltools_0.4.0 viridisLite_0.3.0
## [9] yaml_2.2.1 base64enc_0.1-3 rlang_0.4.5 later_1.0.0
## [13] pillar_1.4.3 glue_1.3.1 withr_2.1.2 lifecycle_0.2.0
## [17] stringr_1.4.0 munsell_0.5.0 gtable_0.3.0 rvest_0.3.5
## [21] htmlwidgets_1.5.1 evaluate_0.14 labeling_0.3 knitr_1.28
## [25] fastmap_1.0.1 httpuv_1.5.2 crosstalk_1.0.0 highr_0.8
## [29] Rcpp_1.0.3 xtable_1.8-4 readr_1.3.1 promises_1.1.0
## [33] scales_1.1.0 backports_1.1.5 webshot_0.5.2 jsonlite_1.6.1
## [37] mime_0.9 farver_2.0.3 hms_0.5.3 digest_0.6.25
## [41] stringi_1.4.6 shiny_1.4.0 bookdown_0.18 dplyr_0.8.5
## [45] grid_3.6.3 rprojroot_1.3-2 tools_3.6.3 magrittr_1.5
## [49] lazyeval_0.2.2 tibble_2.1.3 crayon_1.3.4 tidyr_1.0.2
## [53] pkgconfig_2.0.3 xml2_1.2.2 assertthat_0.2.1 rmarkdown_2.1
## [57] httr_1.4.1 rstudioapi_0.11 R6_2.4.1 compiler_3.6.3
Arino de la Rubia, Eduardo, Hao Zhu, Shannon Ellis, Elin Waring, and Michael Quinn. 2017. Skimr: Skimr. https://github.com/ropenscilabs/skimr.
Dowle, M, A Srinivasan, T Short, S Lianoglou with contributions from R Saporta, and E Antonyan. 2015. Data.table: Extension of Data.frame. https://CRAN.R-project.org/package=data.table.
Grolemund, Garrett, and Hadley Wickham. 2011. “Dates and Times Made Easy with lubridate.” Journal of Statistical Software 40 (3): 1–25. http://www.jstatsoft.org/v40/i03/.
Müller, Kirill. 2017. Here: A Simpler Way to Find Your Files. https://CRAN.R-project.org/package=here.
R Core Team. 2016. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.
Wickham, Hadley. 2009. Ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. http://ggplot2.org.
Xie, Yihui. 2016. Knitr: A General-Purpose Package for Dynamic Report Generation in R. https://CRAN.R-project.org/package=knitr.
Zhu, Hao. 2018. KableExtra: Construct Complex Table with ’Kable’ and Pipe Syntax. https://CRAN.R-project.org/package=kableExtra.